---
title: "Copernicus Marine Extraction"
params:
  # yml: "meta/copernicus_phy.yml"
  yml: "meta/copernicus_bgc-car.yml"
subtitle: "metadata: *`r basename(params$yml)`*"
---
```{r}
#| label: setup
librarian::shelf(
  dplyr, DT, fs, glue, here, logger, jsonlite, listviewer, lubridate, mapview,
  purrr, RColorBrewer, readr, reticulate, sf, stringr, terra, tibble, tidyr,
  units, yaml,
  quiet = T)
options(readr.show_col_types = F)
mapviewOptions(
  basemaps       = "Esri.OceanBasemap",
  vector.palette = colorRampPalette(brewer.pal(n = 5, name = "Spectral")))

if (!exists("params"))
  # params <- list(yml = here("meta/copernicus_phy.yml"))    # DEBUG
  params <- list(yml = here("meta/copernicus_bgc-car.yml"))  # DEBUG

cm_r_dflyrs <- function(r){
  # CopernicusMarine raster to dataframe of layers, parsing names of the
  # form "{dataset_id} {var}[ depth={m}] {itime}"
  d_lyrs <- tibble(
    lyr = names(r)) |>
    separate_wider_regex(
      lyr, c(
        dataset = "^\\S+", " ",
        var     = "\\S+",
        depth   = "(?:\\sdepth=[.0-9]+)?",
        itime   = "\\s[0-9]+$")) |>
    mutate(
      depth = str_replace(depth, "depth=", "") |> as.numeric(),
      itime = as.integer(itime),
      time  = time(r))
  d_lyrs
}
```
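As a quick reference, `cm_r_dflyrs()` assumes layer names of the form `{dataset_id} {var}[ depth={m}] {itime}`, as set during extraction below; a minimal sketch with toy layer names (values purely illustrative):

```{r}
#| label: cm_r_dflyrs_example
#| eval: false
# toy raster whose layer names mimic Copernicus Marine output
r_toy <- rast(nrows = 2, ncols = 2, nlyrs = 2, vals = 1)
names(r_toy) <- c(
  "cmems_mod_glo_phy_my_0.083deg_P1D-m thetao depth=0.49 1",
  "cmems_mod_glo_phy_my_0.083deg_P1D-m thetao depth=0.49 2")
time(r_toy) <- as.POSIXct(c("1993-01-01", "1993-01-02"), tz = "UTC")
cm_r_dflyrs(r_toy)  # tibble with columns: dataset, var, depth, itime, time
```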
## Polygons
```{r}
#| label: polygons
sanctuaries_rds <- here("../climate-dashboard/data/sanctuaries.rds")
buf_pct <- 0.20  # buffer distance as 20% of sqrt(area)

ply_sanctuaries <- readRDS(sanctuaries_rds)

# buffer polygons and get bounding boxes
ply_sanctuaries <- ply_sanctuaries |>
  mutate(
    area_km2 = st_area(geom) |>
      set_units("km^2") |>
      as.numeric() |>
      round(2),
    buf_km = (sqrt(area_km2) * buf_pct) |>
      round(2),
    bbox_geom = map2(geom, buf_km, \(g, b){  # buffer per polygon, not whole column
      st_sfc(g, crs = 4326) |>
        st_buffer(b * 1000) |>
        st_bbox() |>
        round(2) |>
        st_as_sfc() }),
    bbox_chr = map_chr(bbox_geom, \(x){  # xmin, ymin, xmax, ymax
      st_bbox(x) |>
        as.numeric() |>
        paste(collapse = ",") })) |>
  unnest(bbox_geom)

bb_sanctuaries <- st_set_geometry(ply_sanctuaries, "bbox_geom") |>
  select(-geom)
ply_sanctuaries <- select(ply_sanctuaries, -bbox_geom)

# show map
mapView(ply_sanctuaries, zcol = "nms") +
  mapView(bb_sanctuaries, zcol = "nms", legend = F)

ply_sanctuaries |>
  st_drop_geometry() |>
  datatable()
```
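For scale, the buffer grows with the square root of area rather than the area itself; a quick illustrative calculation (hypothetical sanctuary size):

```{r}
#| label: buffer_example
#| eval: false
# a hypothetical 1,000 km^2 sanctuary gets a buffer of ~6.3 km
sqrt(1000) * 0.20  # 6.32 km
```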
## Dataset
### Metadata
Contents of `r basename(params$yml)`:
```{r}
#| label: yml
#| results: asis
cat(
  "```yaml",
  readLines(params$yml),
  "```",
  sep = "\n")
```
- [Python interface - Copernicus Marine Toolbox documentation](https://toolbox-docs.marine.copernicus.eu/en/v2.0.1/python-interface.html#copernicusmarine.subset)
- [Command subset - Copernicus Marine Toolbox documentation](https://toolbox-docs.marine.copernicus.eu/en/v2.0.1/usage/subset-usage.html#option-coordinates-selection-method)
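The central call throughout this page is `copernicusmarine.subset()`; a minimal sketch via {reticulate} (using the `cmt` module handle created in the next chunk; dataset, variable, and bounds here are purely illustrative):

```{r}
#| label: subset_sketch
#| eval: false
# minimal subset: one variable, one bounding box, one time window
cmt$subset(
  dataset_id        = "cmems_mod_glo_phy_my_0.083deg_P1D-m",  # illustrative
  variables         = list("thetao"),  # list() so reticulate passes a Python list
  minimum_longitude = -123, maximum_longitude = -122,
  minimum_latitude  =   37, maximum_latitude  =   38,
  start_datetime    = "1993-01-01", end_datetime = "1993-01-31",
  output_directory  = ".",
  output_filename   = "subset.nc")
```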
```{r}
#| label: dataset
# import copernicusmarine as cmt ----

# do once: create virtual environment and install copernicusmarine Python module
# virtualenv_create(envname = "CopernicusMarine")
# virtualenv_install(envname = "CopernicusMarine", packages = c("copernicusmarine"))

# use CopernicusMarine env with copernicusmarine Python module
use_virtualenv(virtualenv = "CopernicusMarine", required = TRUE)
py_require("copernicusmarine>2.1")
cmt <- import("copernicusmarine")

# login to cmt ----

# register for username and password at https://data.marine.copernicus.eu/register
user <- "bbest1"
pass_txt <- ifelse(
  Sys.info()[["sysname"]] == "Linux",
  "/share/private/data.marine.copernicus.eu_bbest1-password.txt",      # server
  "~/My Drive/private/data.marine.copernicus.eu_bbest1-password.txt")  # laptop
pass <- readLines(pass_txt)
logged_in <- cmt$login(user, pass, force_overwrite = T)  # py_help(cmt$login)
stopifnot(logged_in)

meta <- read_yaml(params$yml)
copernicus_prod <- path_ext_remove(basename(params$yml))
dir_out <- glue("/share/data/noaa-onms/climate-dashboard-app/{copernicus_prod}")
dir_create(dir_out)
```
```{r}
#| label: dataset_times

# get times ----
get_dataset_description <- function(dataset_id){
  cmt$describe(
    dataset_id = dataset_id,
    disable_progress_bar = T)$json() |>
    fromJSON()
}

get_dataset_times <- function(dataset_description){
  services <- pluck(
    dataset_description, "products", "datasets", 1, "versions", 1, "parts", 1, "services", 1)
  service_names <- pluck(
    services, "service_name")
  i_timeseries <- which(service_names == "arco-time-series")
  variables <- pluck(
    services, "variables", i_timeseries, "short_name")
  coordinates <- pluck(
    services, "variables", i_timeseries, "coordinates")
  coordinate_ids <- pluck(
    coordinates, 1, "coordinate_id")  # assuming 1st variable
  i_time <- which(coordinate_ids == "time")
  time_min <- pluck(
    coordinates, 1, "minimum_value", i_time) |>
    # convert from milliseconds since 1970-01-01 00:00:00Z (no leap seconds)
    as.numeric() %>% {. / 1000} |>
    as.POSIXct(origin = "1970-01-01", tz = "UTC")
  time_max <- pluck(
    coordinates, 1, "maximum_value", i_time) |>
    as.numeric() %>% {. / 1000} |>
    as.POSIXct(origin = "1970-01-01", tz = "UTC")
  time_step_secs <- pluck(
    coordinates, 1, "step", i_time) |>
    as.numeric() %>%
    {. / 1000}
  seq.POSIXt(
    from = time_min,
    to   = time_max,
    by   = time_step_secs)
}

d_ds <- tibble(
  dataset_id = meta |>
    pluck("datasets") |>
    map_chr("id")) |>
  mutate(
    description = map(dataset_id, get_dataset_description),
    version = map_chr(
      description, pluck, "products", "datasets", 1, "versions", 1, "label"),
    times = map(description, get_dataset_times),
    time_min = map_vec(times, min),
    time_max = map_vec(times, max),
    n_times = map_int(times, length))

d_ds |>
  select(dataset_id, version, time_min, time_max, n_times) |>
  datatable(
    caption = "Copernicus datasets: version and times. Note the transition from Daily (dataset_id = *phy_my*) to Interim daily (dataset_id = *phy_myint*) at 2021-07-01.")
```
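Note the `/ 1000` in `get_dataset_times()`: the ARCO coordinate values are epoch milliseconds, so they are converted to seconds before `as.POSIXct()`; for example:

```{r}
#| label: epoch_ms_example
#| eval: false
# 1625097600000 ms since 1970-01-01 -> "2021-07-01 UTC" (the my -> myint transition)
as.POSIXct(1625097600000 / 1000, origin = "1970-01-01", tz = "UTC")
```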
```{r}
#| label: existing_lyr_dates
#| eval: false
#| echo: false
system.time({
  d_tif <- tibble(
    tif = dir_ls(dir_out, glob = "*.tif", recurse = T)) |>
    mutate(
      lyrs = map(tif, \(x) names(rast(x))))  # 18.387 sec
})
lyrs <- d_tif$lyrs[[1]]

rast(d_tif$tif[1]) |>
  cm_r_dflyrs()

d_lyrs <- d_tif |>
  # slice(c(1:3, (nrow(d_tif) - 2):nrow(d_tif))) |>
  mutate(
    d = map(tif, \(x){
      r <- rast(x)
      cm_r_dflyrs(r) })) |>
  unnest(d) |>
  mutate(
    nms = dirname(tif) |> basename(),
    yr  = year(time))

d_lyrs_sum <- d_lyrs |>
  group_by(nms, yr) |>
  summarize(
    dataset = unique(dataset) |> paste(collapse = ","),
    .groups = "drop")
  # select(-tif)

table(d_lyrs$dataset)
# View(d_tif)

system.time({
  d_csv <- tibble(
    csv = dir_ls(dir_out, glob = "*.csv", recurse = T)) |>
    mutate(
      d = map(csv, read_csv))  # 7.652 sec
})
```
## Polygon dataset years
```{r}
#| label: d_nms_ds_t
d_ds_yr <- d_ds |>
  mutate(
    d_yrs = map(times, \(ts){
      yrs <- unique(year(ts))
      tibble(
        yr = yrs) |>
        mutate(
          time_min = map_vec(yr, \(yr) min(ts[year(ts) == yr])),
          time_max = map_vec(yr, \(yr) max(ts[year(ts) == yr])),
          n_times  = map_int(yr, \(yr) length(ts[year(ts) == yr]))) })) |>
  select(-description, -times, -time_min, -time_max, -n_times) |>
  unnest(d_yrs) |>
  select(dataset_id, yr, time_min, time_max, n_times)
# d_ds_yr |>
#   filter(yr == 2021)

d_nms_ds_yr_todo <- ply_sanctuaries |>
  st_drop_geometry() |>
  arrange(nms) |>
  select(nms) |>
  cross_join(
    d_ds_yr) |>
  mutate(
    csv = glue("{dir_out}/{nms}/{yr}.csv"),
    tif = glue("{dir_out}/{nms}/{yr}.tif")) |>
  # filter(
  #   yr == 2021) |>
  mutate(
    d_csv = pmap(
      list(tif, yr, dataset_id, time_min, time_max),
      \(tif, yr, dataset_id, time_min, time_max, ...){  # nms = "TBNMS"; yr = 2025
        times_yr <- seq.POSIXt(time_min, time_max, "day")
        # TODO: generalize for month, 6-hr, etc. from new meta/*.yml entry

        # if missing tif, return all times
        if (!file.exists(tif))
          return(list(
            time_min = min(times_yr),
            time_max = max(times_yr),
            n_times  = length(times_yr)))

        # get times from tif
        r         <- rast(tif)
        times_tif <- unique(time(r))

        # if all times done, return NAs
        if (all(times_yr %in% times_tif))
          return(list(
            time_min = NA,
            time_max = NA,
            n_times  = 0))

        # otherwise, return time range missing
        i <- !times_yr %in% times_tif
        list(
          time_min = min(times_yr[i]),
          time_max = max(times_yr[i]),
          n_times  = sum(i)) })) |>
  select(-time_min, -time_max, -n_times) |>
  mutate(
    time_min = map_vec(d_csv, pluck, "time_min"),
    time_max = map_vec(d_csv, pluck, "time_max"),
    n_times  = map_int(d_csv, pluck, "n_times")) |>
  select(-d_csv) |>
  filter(!is.na(time_min)) |>
  rowid_to_column("i") |>
  relocate(i)

d_nms_ds_yr_todo |>
  group_by(nms, dataset_id) %>%
  {if (nrow(.) > 0)
    summarize(
      .,
      n_times  = sum(n_times),
      time_min = min(time_min, na.rm = T),
      time_max = max(time_max, na.rm = T),
      .groups  = "drop") else .} |>
  datatable(
    caption  = "Sanctuaries missing available Copernicus Marine times.",
    rownames = F,
    options  = list(
      pageLength = 5,
      lengthMenu = if (nrow(d_nms_ds_yr_todo) > 5){
        ceiling(seq.int(5, nrow(d_nms_ds_yr_todo), length.out = 3))
      } else {
        c(5) }))
```
## Extract missing times in datasets per polygon year
Using the [Copernicus Marine Toolbox](https://help.marine.copernicus.eu/en/collections/9080063-copernicus-marine-toolbox) in R. See:

- [How to download data via the Copernicus Marine Toolbox in R? | Copernicus Marine Help Center](https://help.marine.copernicus.eu/en/articles/8638253-how-to-download-data-via-the-copernicus-marine-toolbox-in-r)
```{r}
#| label: iterate_subset
fxn <- function(i, dataset_id, nms, time_min, time_max, ...){
  # i = 1; nms = "CBNMS"; dataset_id = "cmems_mod_glo_phy_my_0.083deg_P1D-m"
  # time_min = as.POSIXct("1993-01-01", tz = "UTC"); time_max = as.POSIXct("1993-12-01", tz = "UTC")
  # time_min = as.POSIXct("1993-12-02", tz = "UTC"); time_max = as.POSIXct("1993-12-31", tz = "UTC")
  # d_nms_ds_yr_todo |> slice(1) |> attach()

  err <- tryCatch({
    log_info("{sprintf('%03d', i)}: {nms}, {dataset_id}, {year(time_min)} ({time_min} to {time_max})")

    yr    <- year(time_min)
    r_nc  <- glue("{dir_out}/{nms}/{yr}.nc")
    r_tif <- path_ext_set(r_nc, "tif")
    p_csv <- path_ext_set(r_nc, "csv")

    bb <- bb_sanctuaries |>
      filter(nms == !!nms) |>
      st_bbox()
    p <- ply_sanctuaries |>
      filter(nms == !!nms)

    vars <- map_chr(meta$variables, "id")

    dir_create(dirname(r_nc))
    res <- cmt$subset(  # py_help(cmt$subset)
      dataset_id                   = dataset_id,
      service                      = "timeseries",
      variables                    = vars,
      minimum_longitude            = bb$xmin,
      maximum_longitude            = bb$xmax,
      minimum_latitude             = bb$ymin,
      maximum_latitude             = bb$ymax,
      start_datetime               = time_min,
      end_datetime                 = time_max,
      minimum_depth                = meta$depth$min,
      maximum_depth                = meta$depth$max,
      output_directory             = dirname(r_nc),
      output_filename              = basename(r_nc),
      overwrite                    = TRUE,
      coordinates_selection_method = "inside",
      netcdf_compression_level     = 1,
      disable_progress_bar         = TRUE)

    # TODO: merge with existing
    r <- rast(r_nc)

    # set dataset_id into layer name
    names(r) <- glue("{dataset_id} {str_replace_all(names(r), '_', ' ')}")
    # TODO:
    # - write dataset_version into layer name
    # - handle ?terra::depth()

    if (file_exists(r_tif)){
      r_tmp_tif <- tempfile(fileext = ".tif")
      r_tmp     <- c(rast(r_tif), r)
      r_tmp     <- subset(r_tmp, which(!duplicated(names(r_tmp))))  # rm duplicates
      writeRaster(r_tmp, r_tmp_tif)
      file_delete(r_tif)
      file_move(r_tmp_tif, r_tif)
      rm(r); rm(r_tmp)
    } else {
      writeRaster(r, r_tif, overwrite = T)
    }
    file_delete(r_nc)

    # zonal tif to csv ----
    r <- rast(r_tif)

    extract_tbl <- function(r, p, stat = "mean"){
      terra::extract(r, p, fun = stat, na.rm = T) |>
        pivot_longer(!ID, names_to = "lyr") |>
        separate_wider_regex(
          lyr, c(
            dataset = "^\\S+", " ",
            var     = "\\S+",
            depth   = "(?:\\sdepth=[.0-9]+)?",
            itime   = "\\s[0-9]+$")) |>
        mutate(
          depth = str_replace(depth, "depth=", "") |> as.numeric(),
          itime = as.integer(itime),
          stat  = !!stat,
          time  = time(r)) |>
        select(dataset, var, depth, time, stat, value)
    }

    bind_rows(
      extract_tbl(r, p, "mean"),
      extract_tbl(r, p, "sd")) |>
      write_csv(p_csv, na = "")

    return(NA)
  }, error = function(e){
    log_error(conditionMessage(e))
    return(conditionMessage(e))
  })
  err
}

res <- d_nms_ds_yr_todo |>
  # slice(1) |>  # DEBUG
  mutate(
    error = pmap_chr(list(i, dataset_id, nms, time_min, time_max), fxn))
```
### Successes
```{r}
#| label: success_summary
res |>
  mutate(
    success = is.na(error)) |>
  group_by(success) |>
  summarize(
    n = n()) |>
  datatable()
```
### Errors (if any)
```{r}
#| label: error_summary
res |>
  filter(!is.na(error)) |>
  group_by(nms) |>
  summarize(
    n_years = n(),
    errors  = unique(error) |> paste(collapse = "\n\n----\n\n")) |>
  datatable()
```
::: {.content-hidden}
## TODO
`ed_extract()`:

- [x] delete *_nc dir
- [x] differentiate existing done vs todo for given year
- [ ] allow irregular datasets, like `meta/irregular/*.yaml`: `ERROR: x cell sizes are not regular`
- [ ] break up into functions, not exported

`erddap.qmd`:

- [ ] make `ed_extract()` to single MBNMS with features for main vs david; add "ALL" option to `ed_extract()`
- [ ] add buffer to all (and redo)
- [ ] wrap retry with `ed_dim()` too
:::
```{r}
#| label: rm_empty_nc_dirs
#| eval: false
#| echo: false
# find empty directories ending in _nc and delete them
d_dirs <- tibble(
  dir = list.dirs(here("data"), full.names = T, recursive = T)) |>
  filter(str_detect(dir, "_nc$")) |>
  mutate(
    n_files = map_int(dir, \(x){ length(list.files(x)) }))

# show non-empty directories
d_dirs |>
  filter(n_files > 0) |>
  datatable()

# delete empty directories
d_dirs |>
  filter(n_files == 0) |>
  pull(dir) |>
  walk(\(x){
    message(glue("Deleting {x}"))
    unlink(x, recursive = T) })
```
::: {.callout-caution collapse="true"}
## R package versions
```{r}
devtools::session_info()
```
:::